{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "source": [
    "from datasets import load_dataset"
   ],
   "metadata": {
    "id": "sRZijM_Z-0q0"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "import pandas as pd"
   ],
   "metadata": {
    "id": "iD_RXcCo-6SL"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Load the source question-answering dataset, identified by its dataset name.\n",
    "ds = load_dataset(path=\"iapp_wiki_qa_squad\")"
   ],
   "metadata": {
    "id": "BQePnfRo-06K"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Select the \"train\" split; it is the only split converted in the cells below.\n",
    "train = ds[\"train\"]"
   ],
   "metadata": {
    "id": "ZWwvZgpy-2oY"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Build the instruction/response table from the train split: one row per\n",
    "# example, keeping only the first answer text of each example.\n",
    "answers = train[\"answers\"]  # read the column once and reuse it below\n",
    "df = pd.DataFrame.from_dict(\n",
    "    {\n",
    "        \"INSTRUCTION\": train[\"question\"],\n",
    "        \"RESPONSE\": [answer[\"text\"][0] for answer in answers],\n",
    "        # Every row gets the same constant source label.\n",
    "        \"SOURCE\": [\"wikipedia\"] * len(answers),\n",
    "    }\n",
    ")"
   ],
   "metadata": {
    "id": "0sINkYJR_PQ4"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Display the converted table (INSTRUCTION / RESPONSE / SOURCE) for a visual check.\n",
    "df"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 424
    },
    "id": "MXIQwhKk_n6Q",
    "outputId": "bfd7aa0d-d4b8-4a34-bab4-2b5bfdf4d4e3"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "                                            INSTRUCTION  \\\n",
       "0             พัทธ์ธีรา ศรุติพงศ์โภคิน เกิดวันที่เท่าไร   \n",
       "1            พัทธ์ธีรา ศรุติพงศ์โภคิน มีฃื่อเล่นว่าอะไร   \n",
       "2                  พัทธ์ธีรา ศรุติพงศ์โภคิน ทำอาชีพอะไร   \n",
       "3      พัทธ์ธีรา ศรุติพงศ์โภคิน จบการศึกษาจากประเทศอะไร   \n",
       "4                              บิดาของคลีโอพัตราเป็นใคร   \n",
       "...                                                 ...   \n",
       "5756  จำนวนผู้หญิงที่แก้ไขข้อมูลบนวิกิพีเดียคิดเป็นร...   \n",
       "5757  จำนวนผู้หญิงสหรัฐที่แก้ไขข้อมูลบนวิกิพีเดียคิด...   \n",
       "5758        ปลาตะกากมีชื่อเรียกทางวิทยาศาสตร์ว่าอย่างไร   \n",
       "5759      ปลาตะกากถือว่าเป็นปลาน้ำจืดที่จัดอยู่ในวงศ์ใด   \n",
       "5760  ปลาตะกากมักพบเลี้ยงเป็นปลาสวยงามมีชื่อในวงการว...   \n",
       "\n",
       "                                     RESPONSE     SOURCE  \n",
       "0                         3 ธันวาคม พ.ศ. 2533  wikipedia  \n",
       "1                                          อร  wikipedia  \n",
       "2                           นักแสดงหญิงชาวไทย  wikipedia  \n",
       "3                            ประเทศนิวซีแลนด์  wikipedia  \n",
       "4                        ทอเลมีที่ 12 ออเลติส  wikipedia  \n",
       "...                                       ...        ...  \n",
       "5756                                 8.5 – 16  wikipedia  \n",
       "5757                                    22.7%  wikipedia  \n",
       "5758  ชื่อวิทยาศาสตร์ว่า Cosmochilus harmandi  wikipedia  \n",
       "5759              วงศ์ปลาตะเพียน (Cyprinidae)  wikipedia  \n",
       "5760                            กระมังครีบสูง  wikipedia  \n",
       "\n",
       "[5761 rows x 3 columns]"
      ],
      "text/html": [
       "\n",
       "  <div id=\"df-d27615db-1f71-4ae0-98fe-e0b3123bef9c\">\n",
       "    <div class=\"colab-df-container\">\n",
       "      <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>INSTRUCTION</th>\n",
       "      <th>RESPONSE</th>\n",
       "      <th>SOURCE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>พัทธ์ธีรา ศรุติพงศ์โภคิน เกิดวันที่เท่าไร</td>\n",
       "      <td>3 ธันวาคม พ.ศ. 2533</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>พัทธ์ธีรา ศรุติพงศ์โภคิน มีฃื่อเล่นว่าอะไร</td>\n",
       "      <td>อร</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>พัทธ์ธีรา ศรุติพงศ์โภคิน ทำอาชีพอะไร</td>\n",
       "      <td>นักแสดงหญิงชาวไทย</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>พัทธ์ธีรา ศรุติพงศ์โภคิน จบการศึกษาจากประเทศอะไร</td>\n",
       "      <td>ประเทศนิวซีแลนด์</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>บิดาของคลีโอพัตราเป็นใคร</td>\n",
       "      <td>ทอเลมีที่ 12 ออเลติส</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5756</th>\n",
       "      <td>จำนวนผู้หญิงที่แก้ไขข้อมูลบนวิกิพีเดียคิดเป็นร...</td>\n",
       "      <td>8.5 – 16</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5757</th>\n",
       "      <td>จำนวนผู้หญิงสหรัฐที่แก้ไขข้อมูลบนวิกิพีเดียคิด...</td>\n",
       "      <td>22.7%</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5758</th>\n",
       "      <td>ปลาตะกากมีชื่อเรียกทางวิทยาศาสตร์ว่าอย่างไร</td>\n",
       "      <td>ชื่อวิทยาศาสตร์ว่า Cosmochilus harmandi</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5759</th>\n",
       "      <td>ปลาตะกากถือว่าเป็นปลาน้ำจืดที่จัดอยู่ในวงศ์ใด</td>\n",
       "      <td>วงศ์ปลาตะเพียน (Cyprinidae)</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5760</th>\n",
       "      <td>ปลาตะกากมักพบเลี้ยงเป็นปลาสวยงามมีชื่อในวงการว...</td>\n",
       "      <td>กระมังครีบสูง</td>\n",
       "      <td>wikipedia</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5761 rows × 3 columns</p>\n",
       "</div>\n",
       "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d27615db-1f71-4ae0-98fe-e0b3123bef9c')\"\n",
       "              title=\"Convert this dataframe to an interactive table.\"\n",
       "              style=\"display:none;\">\n",
       "        \n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "       width=\"24px\">\n",
       "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
       "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
       "  </svg>\n",
       "      </button>\n",
       "      \n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      flex-wrap:wrap;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "      <script>\n",
       "        const buttonEl =\n",
       "          document.querySelector('#df-d27615db-1f71-4ae0-98fe-e0b3123bef9c button.colab-df-convert');\n",
       "        buttonEl.style.display =\n",
       "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "        async function convertToInteractive(key) {\n",
       "          const element = document.querySelector('#df-d27615db-1f71-4ae0-98fe-e0b3123bef9c');\n",
       "          const dataTable =\n",
       "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                     [key], {});\n",
       "          if (!dataTable) return;\n",
       "\n",
       "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "            + ' to learn more about interactive tables.';\n",
       "          element.innerHTML = '';\n",
       "          dataTable['output_type'] = 'display_data';\n",
       "          await google.colab.output.renderOutput(dataTable, element);\n",
       "          const docLink = document.createElement('div');\n",
       "          docLink.innerHTML = docLinkHtml;\n",
       "          element.appendChild(docLink);\n",
       "        }\n",
       "      </script>\n",
       "    </div>\n",
       "  </div>\n",
       "  "
      ]
     },
     "metadata": {},
     "execution_count": 12
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Save the table to a local Parquet file using the pyarrow engine,\n",
    "# with row_group_size=100.\n",
    "df.to_parquet(\n",
    "    path=\"dataset.parquet\",\n",
    "    engine=\"pyarrow\",\n",
    "    row_group_size=100,\n",
    ")"
   ],
   "metadata": {
    "id": "bFNDTgDxAD-e"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "# Reload the Parquet file written earlier as a `Dataset`.\n",
    "# NOTE: this rebinds `ds`, which previously held the `load_dataset(...)` result,\n",
    "# so re-running the `train = ds[\"train\"]` cell after this one is likely to fail.\n",
    "ds = Dataset.from_parquet(\"dataset.parquet\")"
   ],
   "metadata": {
    "id": "zSVjPVbqA4Vw"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Publish the reloaded dataset under the given Hub repository id.\n",
    "# NOTE(review): presumably needs an authenticated Hub session with write access - confirm.\n",
    "ds.push_to_hub(repo_id=\"wannaphong/iapp_wiki_qa_squad_oa\")"
   ],
   "metadata": {
    "id": "KWDRH0GGBVFZ"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}
